In [1]:
# --- Task 1: Understand the Dataset ---
# pandas is our toolkit for working with tabular data.
import pandas as pd

# Load the raw case file into a DataFrame.
# NOTE(review): hardcoded absolute local path — works only on this machine.
df = pd.read_csv(r"C:\Users\himanshu\Downloads\Telco_Customer_Churn_Dataset (3).csv")
print("File loaded successfully! Let's take a look...\n")

# Reusable visual separator between report sections.
separator = "\n" + "=" * 50 + "\n"

# 1. Peek at the first 10 records [cite: 54]
print("--- First 10 Rows of the Dataset ---")
print(df.head(10))
print(separator)

# 2. Inspect each column's dtype and non-null count [cite: 55]
print("--- Data Types of Each Column ---")
df.info()
print(separator)

# 3. Count the missing values in every column [cite: 56]
print("--- Missing Values in Each Column ---")
print(df.isnull().sum())
File loaded successfully! Let's take a look...
--- First 10 Rows of the Dataset ---
customerID gender SeniorCitizen Partner Dependents tenure PhoneService \
0 7590-VHVEG Female 0 Yes No 1 No
1 5575-GNVDE Male 0 No No 34 Yes
2 3668-QPYBK Male 0 No No 2 Yes
3 7795-CFOCW Male 0 No No 45 No
4 9237-HQITU Female 0 No No 2 Yes
5 9305-CDSKC Female 0 No No 8 Yes
6 1452-KIOVK Male 0 No Yes 22 Yes
7 6713-OKOMC Female 0 No No 10 No
8 7892-POOKP Female 0 Yes No 28 Yes
9 6388-TABGU Male 0 No Yes 62 Yes
MultipleLines InternetService OnlineSecurity ... DeviceProtection \
0 No phone service DSL No ... No
1 No DSL Yes ... Yes
2 No DSL Yes ... No
3 No phone service DSL Yes ... Yes
4 No Fiber optic No ... No
5 Yes Fiber optic No ... Yes
6 Yes Fiber optic No ... No
7 No phone service DSL Yes ... No
8 Yes Fiber optic No ... Yes
9 No DSL Yes ... No
TechSupport StreamingTV StreamingMovies Contract PaperlessBilling \
0 No No No Month-to-month Yes
1 No No No One year No
2 No No No Month-to-month Yes
3 Yes No No One year No
4 No No No Month-to-month Yes
5 No Yes Yes Month-to-month Yes
6 No Yes No Month-to-month Yes
7 No No No Month-to-month No
8 Yes Yes Yes Month-to-month Yes
9 No No No One year No
PaymentMethod MonthlyCharges TotalCharges Churn
0 Electronic check 29.85 29.85 No
1 Mailed check 56.95 1889.5 No
2 Mailed check 53.85 108.15 Yes
3 Bank transfer (automatic) 42.30 1840.75 No
4 Electronic check 70.70 151.65 Yes
5 Electronic check 99.65 820.5 Yes
6 Credit card (automatic) 89.10 1949.4 No
7 Mailed check 29.75 301.9 No
8 Electronic check 104.80 3046.05 Yes
9 Bank transfer (automatic) 56.15 3487.95 No
[10 rows x 21 columns]
==================================================
--- Data Types of Each Column ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customerID 7043 non-null object
1 gender 7043 non-null object
2 SeniorCitizen 7043 non-null int64
3 Partner 7043 non-null object
4 Dependents 7043 non-null object
5 tenure 7043 non-null int64
6 PhoneService 7043 non-null object
7 MultipleLines 7043 non-null object
8 InternetService 7043 non-null object
9 OnlineSecurity 7043 non-null object
10 OnlineBackup 7043 non-null object
11 DeviceProtection 7043 non-null object
12 TechSupport 7043 non-null object
13 StreamingTV 7043 non-null object
14 StreamingMovies 7043 non-null object
15 Contract 7043 non-null object
16 PaperlessBilling 7043 non-null object
17 PaymentMethod 7043 non-null object
18 MonthlyCharges 7043 non-null float64
19 TotalCharges 7043 non-null object
20 Churn 7043 non-null object
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB
==================================================
--- Missing Values in Each Column ---
customerID 0
gender 0
SeniorCitizen 0
Partner 0
Dependents 0
tenure 0
PhoneService 0
MultipleLines 0
InternetService 0
OnlineSecurity 0
OnlineBackup 0
DeviceProtection 0
TechSupport 0
StreamingTV 0
StreamingMovies 0
Contract 0
PaperlessBilling 0
PaymentMethod 0
MonthlyCharges 0
TotalCharges 0
Churn 0
dtype: int64
In [6]:
# --- Task 2: Data Cleaning ---
# Let's bring in our data assistant, pandas.
import pandas as pd

# Reload the raw case file so this cell is self-contained on a fresh kernel.
df = pd.read_csv(r"C:\Users\himanshu\Downloads\Telco_Customer_Churn_Dataset (3).csv")
print("File loaded. Beginning cleaning process...\n")

# --- Step 1: Handle missing/incorrect values in 'TotalCharges' ---
# 'TotalCharges' is read as text because some rows contain blank strings.
# errors='coerce' turns anything non-numeric into NaN so we can find it.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Now, let's see if any blank spots appeared after that change.
print("--- Checking for blank spots after converting 'TotalCharges' to numbers ---")
print(df.isnull().sum())

# Report the actual count instead of hardcoding "11", so the message stays
# correct if the source file ever changes.
missing_total_charges = df['TotalCharges'].isnull().sum()
print(f"\nAha! We found {missing_total_charges} blank spots in 'TotalCharges'!\n")

# Fill the gaps with 0. Assign back to the column instead of the chained
# `df['TotalCharges'].fillna(0, inplace=True)` form, which is deprecated
# (chained-assignment inplace may act on a copy and is removed in pandas 3.0).
# This also matches the fillna style used by the later cells in this notebook.
df['TotalCharges'] = df['TotalCharges'].fillna(0)
print("--- Blank spots have been filled with 0. Let's check again ---")
print(df.isnull().sum())
print("\nSuccess! No more blank spots.")
print("\n" + "="*50 + "\n")

# --- Step 2: Remove duplicate records ---
# Let's count how many duplicate rows we have.
print(f"Number of duplicate rows found: {df.duplicated().sum()}")
# Reassign rather than mutate in place — same result, no hidden-state surprises.
df = df.drop_duplicates()
print("Duplicate rows have been removed.\n")
print("="*50 + "\n")

# --- Step 3: Standardize column names ---
# Lowercase every column name so later cells can reference them uniformly.
df.columns = [col.lower() for col in df.columns]
print("--- All column names are now standardized to lowercase ---")
print(df.columns.tolist())
File loaded. Beginning cleaning process... --- Checking for blank spots after converting 'TotalCharges' to numbers --- customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 11 Churn 0 dtype: int64 Aha! We found 11 blank spots in 'TotalCharges'! --- Blank spots have been filled with 0. Let's check again --- customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64 Success! No more blank spots. ================================================== Number of duplicate rows found: 0 Duplicate rows have been removed. ================================================== --- All column names are now standardized to lowercase --- ['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents', 'tenure', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod', 'monthlycharges', 'totalcharges', 'churn']
In [3]:
# --- Task 3: Exploratory Data Analysis (EDA) ---
# Part 0: SETUP - We'll get everything ready first.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence warnings so the narrative output stays readable.
warnings.filterwarnings('ignore')
# Apply a consistent look to every chart.
sns.set_style("whitegrid")

# --- Load and Clean the Data (Prerequisite Steps) ---
# Re-run the cleaning pipeline so this cell works from a fresh kernel.
df = pd.read_csv(r"C:\Users\himanshu\Downloads\Telco_Customer_Churn_Dataset (3).csv")
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.fillna({'TotalCharges': 0}, inplace=True)
df.columns = [col.lower() for col in df.columns]
print("Data loaded and cleaned. Starting Task 3 analysis...\n")

# The numeric columns we will summarize and plot below.
numerical_features = ['tenure', 'monthlycharges', 'totalcharges']

# --- Step 1: Generate Summary Statistics ---
# A quick report card for our main numerical columns.
print("--- Summary Statistics for Numerical Data ---")
print(df[numerical_features].describe())
print("\n" + "="*50 + "\n")

# --- Step 2: Analyze Churn Rate ---
# What percentage of customers left?
print("--- Churn Proportions ---")
churn_percentage = df['churn'].value_counts(normalize=True) * 100
print(churn_percentage)
print("\n")

# A simple bar chart to see the Yes/No difference.
plt.figure(figsize=(8, 6))
sns.countplot(x='churn', data=df)
plt.title('Distribution of Customer Churn', fontsize=16)
plt.xlabel('Did the Customer Leave? (Churn)', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
plt.show()

# --- Step 3: Create Visualizations for Numerical Columns ---
# One histogram and one box plot per numeric column.
for column in numerical_features:
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(14, 5))
    # Histogram: the 'shape' of the distribution.
    sns.histplot(df[column], kde=True, bins=30, ax=ax_hist)
    ax_hist.set_title(f'Histogram of {column.capitalize()}', fontsize=14)
    # Box plot: the 'spread' of the data and its outliers.
    sns.boxplot(x=df[column], ax=ax_box)
    ax_box.set_title(f'Box Plot of {column.capitalize()}', fontsize=14)
    fig.tight_layout()
    plt.show()
Data loaded and cleaned. Starting Task 3 analysis...
--- Summary Statistics for Numerical Data ---
tenure monthlycharges totalcharges
count 7043.000000 7043.000000 7043.000000
mean 32.371149 64.761692 2279.734304
std 24.559481 30.090047 2266.794470
min 0.000000 18.250000 0.000000
25% 9.000000 35.500000 398.550000
50% 29.000000 70.350000 1394.550000
75% 55.000000 89.850000 3786.600000
max 72.000000 118.750000 8684.800000
==================================================
--- Churn Proportions ---
churn
No 73.463013
Yes 26.536987
Name: proportion, dtype: float64
In [4]:
# --- Task 4: Customer Segmentation Visualization ---
# Part 0: SETUP - Get our tools and data ready
import pandas as pd
import plotly.express as px  # interactive charts for this task
import warnings

# Keep the output clean.
warnings.filterwarnings('ignore')

# --- Load and Clean the Data (Prerequisite Steps) ---
# Re-run the cleaning pipeline so this cell works from a fresh kernel.
df = pd.read_csv(r"C:\Users\himanshu\Downloads\Telco_Customer_Churn_Dataset (3).csv")
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.fillna({'TotalCharges': 0}, inplace=True)
df.columns = [col.lower() for col in df.columns]
print("Data loaded and cleaned. Starting Task 4 visualizations...\n")

# --- Part 1: Create Customer Groups Based on Tenure ---
# Slice customers into three cohorts based on how long they have stayed.
print("--- Creating Tenure Groups ---")
# Bin edges (months) and the label shown for each resulting cohort.
tenure_bins = [0, 12, 36, 73]
tenure_labels = ['0-12 Months', '13-36 Months', '37+ Months']
df['tenure_group'] = pd.cut(
    df['tenure'],
    bins=tenure_bins,
    labels=tenure_labels,
    include_lowest=True,  # keep tenure == 0 in the first bucket
)
print("Successfully sorted customers into tenure groups.\n")

# --- Part 2: Visualize Customer Distribution (Donut Chart) ---
# Count how many customers land in each cohort.
tenure_distribution = df['tenure_group'].value_counts().reset_index()
tenure_distribution.columns = ['tenure_group', 'customer_count']
fig_donut = px.pie(
    tenure_distribution,
    names='tenure_group',
    values='customer_count',
    title='Customer Distribution by Tenure Group',
    hole=0.4,  # the hole turns the pie into a donut
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
# Put percent + label inside each slice.
fig_donut.update_traces(textposition='inside', textinfo='percent+label')
print("Displaying Donut Chart...")
fig_donut.show()

# --- Part 3: Compare Average Monthly Charges (Bar Chart) ---
# Do long-time customers pay more per month? Average charge per cohort.
avg_monthly_charges = (
    df.groupby('tenure_group')['monthlycharges']
    .mean()
    .round(2)
    .reset_index()
)
fig_bar = px.bar(
    avg_monthly_charges,
    x='tenure_group',
    y='monthlycharges',
    title='Average Monthly Charges by Tenure Group',
    text='monthlycharges',  # value labels on the bars
    color='tenure_group',
    labels={'tenure_group': 'Tenure Group', 'monthlycharges': 'Average Monthly Charge ($)'},
)
# Format the bar labels as dollar amounts.
fig_bar.update_traces(texttemplate='$%{text}', textposition='outside')
fig_bar.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
print("\nDisplaying Bar Chart...")
fig_bar.show()
Data loaded and cleaned. Starting Task 4 visualizations... --- Creating Tenure Groups --- Successfully sorted customers into tenure groups. Displaying Donut Chart...
Displaying Bar Chart...
In [5]:
# --- Task 5: Advanced Analysis ---
# Part 0: SETUP - Get our tools and data ready
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Ignore warnings for a clean output
warnings.filterwarnings('ignore')
# Set a nice style for our charts
sns.set_style("whitegrid")

# --- Load and Clean the Data (Prerequisite Steps) ---
# Re-run the cleaning pipeline so this cell works from a fresh kernel.
df = pd.read_csv(r"C:\Users\himanshu\Downloads\Telco_Customer_Churn_Dataset (3).csv")
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.fillna({'TotalCharges': 0}, inplace=True)
df.columns = [col.lower() for col in df.columns]
print("Data loaded and cleaned. Starting Final Analysis...\n")

# --- Part 1: Prepare for Churn Calculation ---
# Encode churn as 1 ('Yes') / 0 (anything else) so group means become rates.
# Vectorized comparison instead of a row-wise .apply(lambda ...): same result,
# but idiomatic pandas and much faster on the full frame.
df['churn_numeric'] = (df['churn'] == 'Yes').astype(int)
print("--- Prepared data for churn rate analysis ---")
print("Customers who churned are now marked as 1, and those who stayed as 0.\n")

# --- Part 2: Analyze Churn by Demographics ---
# Is churn different across gender or for senior citizens?
print("--- Investigating Demographics... ---")
demographics = ['gender', 'seniorcitizen']
for col in demographics:
    plt.figure(figsize=(8, 6))
    # barplot of the 0/1 column shows the MEAN churn rate per category.
    sns.barplot(x=col, y='churn_numeric', data=df)
    plt.title(f'Churn Rate by {col.capitalize()}', fontsize=16)
    plt.ylabel('Churn Rate', fontsize=12)
    plt.xlabel(col.capitalize(), fontsize=12)
    plt.show()

# --- Part 3: Analyze Churn by Contract and Payment Method ---
# Look for big clues related to the services themselves.
print("\n--- Investigating Contracts and Payment Methods... ---")
contract_payment = ['contract', 'paymentmethod']
for col in contract_payment:
    plt.figure(figsize=(10, 6))
    # A count plot with 'hue' shows Yes vs No churn bars for each category.
    sns.countplot(x=col, hue='churn', data=df)
    plt.title(f'Churn Distribution by {col.capitalize()}', fontsize=16)
    plt.xlabel(col.capitalize(), fontsize=12)
    plt.ylabel('Number of Customers', fontsize=12)
    plt.xticks(rotation=25, ha='right')  # rotate labels to prevent overlap
    plt.tight_layout()
    plt.show()

# --- Part 4: Visualize Churn Trends Over Customer Lifecycle ---
# Does churn change the longer someone is a customer?
print("\n--- Investigating Churn Rate over Time (Tenure)... ---")
plt.figure(figsize=(12, 6))
# lineplot aggregates churn_numeric per tenure value (mean = churn rate).
sns.lineplot(data=df, x='tenure', y='churn_numeric')
plt.title('Churn Rate vs. Customer Tenure', fontsize=16)
plt.xlabel('Tenure (Months)', fontsize=12)
plt.ylabel('Churn Rate', fontsize=12)
plt.show()
Data loaded and cleaned. Starting Final Analysis... --- Prepared data for churn rate analysis --- Customers who churned are now marked as 1, and those who stayed as 0. --- Investigating Demographics... ---
--- Investigating Contracts and Payment Methods... ---
--- Investigating Churn Rate over Time (Tenure)... ---